import jieba
from sklearn.manifold import TSNE
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
YaHei = FontProperties(fname='/Users/zhaojichang/Library/Fonts/Microsoft-YaHei.ttf')
import numpy as np


# Build the corpus (text preprocessing is omitted).
# Tokens discarded after word segmentation.
stopset = {'[', ']', '@'}
sentences = []
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding (assumes the dump is UTF-8 -- confirm against the data).
with open('../python2/weibo.txt', 'r', encoding='utf-8') as f:
    for line in f:
        fields = line.strip().split('\t')
        # The text to segment is the second tab-separated column; skip lines
        # (e.g. blank ones) that lack it instead of failing with IndexError.
        if len(fields) < 2:
            continue
        sen = [w for w in jieba.cut(fields[1]) if w not in stopset]
        sentences.append(sen)
print(f'load {len(sentences)} tweets...')
#sentences = [["节拍", "不对", "也", "无所谓"],
#             ["想唱", "就唱", "想睡", "就睡"],
#             ["你", "问", "快乐", "在哪里", "快乐", "在这里"],
#             ["你", "问", "快乐", "在哪里", "现在", "就", "告诉", "你"]]


# Train the Word2Vec model on the tokenized sentences.
# min_count=1 keeps every token, including those that occur only once.
model = Word2Vec(sentences, vector_size=300, window=5, min_count=1)

# Fetch the 10 nearest neighbours of the query word, and the 10 words closest
# to its negated vector (used here as a proxy for the "least related" words).
query = "春天"
if query in model.wv:
    most_similar = model.wv.most_similar(query, topn=10)
    least_similar = model.wv.most_similar(negative=[query], topn=10)
else:
    # most_similar raises KeyError for an out-of-vocabulary word, which would
    # abort the script before the trained model is saved below.
    print(f'"{query}" is not in the vocabulary')
    most_similar = []
    least_similar = []

print(most_similar)
print(least_similar)

# Persist the trained model to disk.
model.save("word2vec.model")